In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
In [25]:
import pandas as pd

df = pd.read_csv("hmeq.csv")

df.head()
Out[25]:
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
0 1 1100 25860.0 39025.0 HomeImp Other 10.5 0.0 0.0 94.366667 1.0 9.0 NaN
1 1 1300 70053.0 68400.0 HomeImp Other 7.0 0.0 2.0 121.833333 0.0 14.0 NaN
2 1 1500 13500.0 16700.0 HomeImp Other 4.0 0.0 0.0 149.466667 1.0 10.0 NaN
3 1 1500 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0 1700 97800.0 112000.0 HomeImp Office 3.0 0.0 0.0 93.333333 0.0 14.0 NaN

Data Preprocessing

In [26]:
df.describe()
Out[26]:
BAD LOAN MORTDUE VALUE YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
count 5960.000000 5960.000000 5442.000000 5848.000000 5445.000000 5252.000000 5380.000000 5652.000000 5450.000000 5738.000000 4693.000000
mean 0.199497 18607.969799 73760.817200 101776.048741 8.922268 0.254570 0.449442 179.766275 1.186055 21.296096 33.779915
std 0.399656 11207.480417 44457.609458 57385.775334 7.573982 0.846047 1.127266 85.810092 1.728675 10.138933 8.601746
min 0.000000 1100.000000 2063.000000 8000.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.524499
25% 0.000000 11100.000000 46276.000000 66075.500000 3.000000 0.000000 0.000000 115.116702 0.000000 15.000000 29.140031
50% 0.000000 16300.000000 65019.000000 89235.500000 7.000000 0.000000 0.000000 173.466667 1.000000 20.000000 34.818262
75% 0.000000 23300.000000 91488.000000 119824.250000 13.000000 0.000000 0.000000 231.562278 2.000000 26.000000 39.003141
max 1.000000 89900.000000 399550.000000 855909.000000 41.000000 10.000000 15.000000 1168.233561 17.000000 71.000000 203.312149
In [27]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   BAD      5960 non-null   int64  
 1   LOAN     5960 non-null   int64  
 2   MORTDUE  5442 non-null   float64
 3   VALUE    5848 non-null   float64
 4   REASON   5708 non-null   object 
 5   JOB      5681 non-null   object 
 6   YOJ      5445 non-null   float64
 7   DEROG    5252 non-null   float64
 8   DELINQ   5380 non-null   float64
 9   CLAGE    5652 non-null   float64
 10  NINQ     5450 non-null   float64
 11  CLNO     5738 non-null   float64
 12  DEBTINC  4693 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 605.4+ KB
In [28]:
df.isnull().sum()
Out[28]:
BAD           0
LOAN          0
MORTDUE     518
VALUE       112
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64
In [29]:
for col in df.select_dtypes(include='float64').columns:
    print(f"\nColumn: {col}")
    print(df[col].value_counts(dropna=False))
Column: MORTDUE
MORTDUE
NaN         518
42000.0      11
47000.0      10
65000.0       9
124000.0      7
           ... 
65372.0       1
15346.0       1
58549.0       1
69195.0       1
48811.0       1
Name: count, Length: 5054, dtype: int64

Column: VALUE
VALUE
NaN         112
60000.0      15
80000.0      14
85000.0      12
65000.0      11
           ... 
116994.0      1
42682.0       1
72175.0       1
70095.0       1
88934.0       1
Name: count, Length: 5382, dtype: int64

Column: YOJ
YOJ
NaN      515
0.00     415
1.00     363
2.00     347
5.00     333
        ... 
29.90      1
12.90      1
13.50      1
0.25       1
8.30       1
Name: count, Length: 100, dtype: int64

Column: DEROG
DEROG
0.0     4527
NaN      708
1.0      435
2.0      160
3.0       58
4.0       23
5.0       15
6.0       15
7.0        8
8.0        6
9.0        3
10.0       2
Name: count, dtype: int64

Column: DELINQ
DELINQ
0.0     4179
1.0      654
NaN      580
2.0      250
3.0      129
4.0       78
5.0       38
6.0       27
7.0       13
8.0        5
10.0       2
11.0       2
15.0       1
12.0       1
13.0       1
Name: count, dtype: int64

Column: CLAGE
CLAGE
NaN           308
102.500000      7
206.966667      7
177.500000      6
123.766667      6
             ... 
240.856017      1
196.241371      1
71.461705       1
184.880011      1
219.601002      1
Name: count, Length: 5315, dtype: int64

Column: NINQ
NINQ
0.0     2531
1.0     1339
2.0      780
NaN      510
3.0      392
4.0      156
5.0       75
6.0       56
7.0       44
10.0      28
8.0       22
9.0       11
11.0      10
12.0       2
13.0       2
14.0       1
17.0       1
Name: count, dtype: int64

Column: CLNO
CLNO
16.0    316
19.0    307
24.0    264
23.0    259
21.0    235
       ... 
58.0      3
71.0      2
53.0      2
57.0      1
63.0      1
Name: count, Length: 63, dtype: int64

Column: DEBTINC
DEBTINC
NaN          1267
34.964141       1
41.576701       1
41.395462       1
20.688715       1
             ... 
39.244669       1
40.943866       1
30.444839       1
36.158718       1
34.571519       1
Name: count, Length: 4694, dtype: int64
In [30]:
#DAre people with missing mortgage data more likely to default?

df.groupby(df['MORTDUE'].isna())['BAD'].mean()
Out[30]:
MORTDUE
False    0.199008
True     0.204633
Name: BAD, dtype: float64

evaluating missing values in all numerical features and compared default rates between missing and non-missing observations.

In [31]:
num_cols = df.select_dtypes(include=['float64']).columns

for col in num_cols:
    if df[col].isna().sum() > 0:
        print("="*50)
        print(f"Column: {col}")
        print("Missing %:", round(df[col].isna().mean()*100, 2))
        print(df.groupby(df[col].isna())['BAD'].mean())
==================================================
Column: MORTDUE
Missing %: 8.69
MORTDUE
False    0.199008
True     0.204633
Name: BAD, dtype: float64
==================================================
Column: VALUE
Missing %: 1.88
VALUE
False    0.185363
True     0.937500
Name: BAD, dtype: float64
==================================================
Column: YOJ
Missing %: 8.64
YOJ
False    0.206428
True     0.126214
Name: BAD, dtype: float64
==================================================
Column: DEROG
Missing %: 11.88
DEROG
False    0.209825
True     0.122881
Name: BAD, dtype: float64
==================================================
Column: DELINQ
Missing %: 9.73
DELINQ
False    0.207621
True     0.124138
Name: BAD, dtype: float64
==================================================
Column: CLAGE
Missing %: 5.17
CLAGE
False    0.196568
True     0.253247
Name: BAD, dtype: float64
==================================================
Column: NINQ
Missing %: 8.56
NINQ
False    0.204404
True     0.147059
Name: BAD, dtype: float64
==================================================
Column: CLNO
Missing %: 3.72
CLNO
False    0.197978
True     0.238739
Name: BAD, dtype: float64
==================================================
Column: DEBTINC
Missing %: 21.26
DEBTINC
False    0.085873
True     0.620363
Name: BAD, dtype: float64

Performing MICE ( Multiple Imputation by Chained Equations ) technique to impute numerical missing values

In [32]:
df.head()
Out[32]:
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
0 1 1100 25860.0 39025.0 HomeImp Other 10.5 0.0 0.0 94.366667 1.0 9.0 NaN
1 1 1300 70053.0 68400.0 HomeImp Other 7.0 0.0 2.0 121.833333 0.0 14.0 NaN
2 1 1500 13500.0 16700.0 HomeImp Other 4.0 0.0 0.0 149.466667 1.0 10.0 NaN
3 1 1500 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0 1700 97800.0 112000.0 HomeImp Office 3.0 0.0 0.0 93.333333 0.0 14.0 NaN

Checking the duplicates

In [33]:
df.duplicated().sum()
Out[33]:
0
In [34]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Select numeric float columns
num_cols = df.select_dtypes(include=['float64']).columns

# Initialize imputer
imputer = IterativeImputer(random_state=42)

# Impute and round to 2 decimals
df[num_cols] = (
    pd.DataFrame(
        imputer.fit_transform(df[num_cols]),
        columns=num_cols,
        index=df.index
    ).round(2)
)
In [35]:
df=pd.DataFrame(df)
In [36]:
df.isnull().sum()
Out[36]:
BAD          0
LOAN         0
MORTDUE      0
VALUE        0
REASON     252
JOB        279
YOJ          0
DEROG        0
DELINQ       0
CLAGE        0
NINQ         0
CLNO         0
DEBTINC      0
dtype: int64
In [37]:
df.head()
Out[37]:
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
0 1 1100 25860.00 39025.00 HomeImp Other 10.50 0.00 0.00 94.37 1.00 9.00 31.77
1 1 1300 70053.00 68400.00 HomeImp Other 7.00 0.00 2.00 121.83 0.00 14.00 33.58
2 1 1500 13500.00 16700.00 HomeImp Other 4.00 0.00 0.00 149.47 1.00 10.00 31.46
3 1 1500 70988.55 101779.25 NaN NaN 8.99 0.27 0.45 178.90 1.19 21.25 33.88
4 0 1700 97800.00 112000.00 HomeImp Office 3.00 0.00 0.00 93.33 0.00 14.00 33.95

Checking the distribution and filling the null values of categorical features with mode

In [38]:
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd

# Plot 1: Distribution of REASON with respect to BAD
reason_bad = pd.crosstab(df['REASON'], df['BAD'])

plt.figure()
reason_bad.plot(kind='bar')
plt.title("Distribution of BAD with Respect to REASON")
plt.xlabel("REASON")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

# Plot 2: Distribution of JOB with respect to BAD
job_bad = pd.crosstab(df['JOB'], df['BAD'])

plt.figure()
job_bad.plot(kind='bar')
plt.title("Distribution of BAD with Respect to JOB")
plt.xlabel("JOB")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()
<Figure size 640x480 with 0 Axes>
No description has been provided for this image
<Figure size 640x480 with 0 Axes>
No description has been provided for this image
In [39]:
df.groupby(df['REASON'].isna())['BAD'].mean()
Out[39]:
REASON
False    0.199895
True     0.190476
Name: BAD, dtype: float64
In [40]:
df.groupby(df['JOB'].isna())['BAD'].mean()
Out[40]:
JOB
False    0.205246
True     0.082437
Name: BAD, dtype: float64
In [41]:
#Performing simple imputation: filling the missing values with most frequent values


from sklearn.impute import SimpleImputer

# Select categorical columns
cat_cols = df.select_dtypes(include=['object']).columns

# Initialize imputer
cat_imputer = SimpleImputer(strategy='most_frequent')

# Impute and keep structure
df[cat_cols] = pd.DataFrame(
    cat_imputer.fit_transform(df[cat_cols]),
    columns=cat_cols,
    index=df.index
)
In [42]:
df.isna().sum()
Out[42]:
BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

EDA

In [43]:
for col in cat_cols:
    plt.figure(figsize=(6,4))
    sns.countplot(data=df, x=col, hue='BAD')
    plt.title(f'{col} vs BAD')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    plt.savefig(f"categorical_plots/{col}_vs_BAD.png", dpi=300)
    
    plt.show()      
    plt.close()
No description has been provided for this image
No description has been provided for this image

insights from categorical variables

In [44]:
import matplotlib.pyplot as plt
import seaborn as sns

pivot_table = df.pivot_table(
    values='BAD',
    index='JOB',
    columns='REASON',
    aggfunc='mean'
)

plt.figure(figsize=(8,6))
sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="Reds")
plt.title("Default Rate Heatmap (JOB vs REASON)")

# Save directly into existing folder
plt.savefig("categorical_plots/default_rate_heatmap_JOB_vs_REASON.png",
            dpi=300, bbox_inches='tight')

plt.show()
plt.close()
No description has been provided for this image
In [45]:
df['JOB'].value_counts()
Out[45]:
JOB
Other      2667
ProfExe    1276
Office      948
Mgr         767
Self        193
Sales       109
Name: count, dtype: int64
In [46]:
import matplotlib.pyplot as plt
import pandas as pd

pd.crosstab(
    [df['JOB'], df['REASON']],
    df['BAD'],
    normalize='index'
).plot(kind='bar', stacked=True, figsize=(10,6))

plt.title("Proportion of BAD within JOB & REASON")
plt.ylabel("Proportion")

# Save the plot
plt.savefig("categorical_plots/proportion_BAD_JOB_REASON.png",
            dpi=300, bbox_inches='tight')

plt.show()
plt.close()
No description has been provided for this image
In [47]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Create folder
os.makedirs("numerical_plots", exist_ok=True)

# Identify numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove target column
if 'BAD' in num_cols:
    num_cols.remove('BAD')

for col in num_cols:
    plt.figure(figsize=(6,4))
    
    sns.violinplot(data=df, x='BAD', y=col, inner='quartile')
    plt.title(f'{col} distribution by BAD')
    plt.tight_layout()
    
    # Save image
    plt.savefig(f"numerical_plots/{col}_vs_BAD.png", dpi=300)
    
    # Show image
    plt.show()
    
    # Close figure
    plt.close()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [48]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select numeric columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove target from features
if 'BAD' in num_cols:
    num_cols.remove('BAD')

# Define a custom palette for clarity
palette = {0: "#1f77b4", 1: "#ff7f0e"}  # Blue for 0, Orange for 1

# Pairplot with improved colors
sns.pairplot(
    df[num_cols + ['BAD']], 
    hue='BAD', 
    diag_kind='kde', 
    palette=palette,
    markers=["o", "s"],   # Circle for 0, Square for 1
    plot_kws={'alpha': 0.6}  
)
plt.show()
No description has been provided for this image
In [49]:
import pandas as pd

num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Compute correlation matrix
corr_matrix = df[num_cols].corr()

print(corr_matrix)
              BAD      LOAN   MORTDUE     VALUE       YOJ     DEROG    DELINQ  \
BAD      1.000000 -0.075099 -0.024085 -0.028126 -0.061991  0.263514  0.339712   
LOAN    -0.075099  1.000000  0.228569  0.331892  0.104063 -0.001596 -0.025774   
MORTDUE -0.024085  0.228569  1.000000  0.896879 -0.078326 -0.047213  0.003688   
VALUE   -0.028126  0.331892  0.896879  1.000000  0.000147 -0.048287 -0.006018   
YOJ     -0.061991  0.104063 -0.078326  0.000147  1.000000 -0.069331  0.032649   
DEROG    0.263514 -0.001596 -0.047213 -0.048287 -0.069331  1.000000  0.238561   
DELINQ   0.339712 -0.025774  0.003688 -0.006018  0.032649  0.238561  1.000000   
CLAGE   -0.165805  0.086985  0.127219  0.176783  0.217904 -0.080652  0.026205   
NINQ     0.170447  0.049260  0.032215 -0.006780 -0.082396  0.195430  0.076364   
CLNO    -0.003031  0.076244  0.350993  0.271876  0.022202  0.067706  0.168269   
DEBTINC  0.146887  0.084961  0.207379  0.146039 -0.061950  0.033934  0.069509   

            CLAGE      NINQ      CLNO   DEBTINC  
BAD     -0.165805  0.170447 -0.003031  0.146887  
LOAN     0.086985  0.049260  0.076244  0.084961  
MORTDUE  0.127219  0.032215  0.350993  0.207379  
VALUE    0.176783 -0.006780  0.271876  0.146039  
YOJ      0.217904 -0.082396  0.022202 -0.061950  
DEROG   -0.080652  0.195430  0.067706  0.033934  
DELINQ   0.026205  0.076364  0.168269  0.069509  
CLAGE    1.000000 -0.121013  0.247371 -0.041768  
NINQ    -0.121013  1.000000  0.090880  0.169463  
CLNO     0.247371  0.090880  1.000000  0.212974  
DEBTINC -0.041768  0.169463  0.212974  1.000000  
In [50]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, 
            annot=True, 
            cmap='coolwarm', 
            fmt='.2f', 
            linewidths=0.5)

plt.title("Correlation Matrix")
plt.show()
No description has been provided for this image
In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math

num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Folder to save boxplots
save_dir = "boxplots"
os.makedirs(save_dir, exist_ok=True)  

# Number of plots per row
n_cols = 3
n_rows = math.ceil(len(num_cols) / n_cols)

# Set figure size
plt.figure(figsize=(5*n_cols, 5*n_rows))

# Loop through numerical columns
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=df[col], color='lightblue')
    plt.title(col)
    plt.tight_layout()

    # Save individual boxplot
    plt.savefig(os.path.join(save_dir, f"{col}_boxplot.png"))

plt.show()
No description has been provided for this image

Splitting the data in train test split and then will perform feature engineering to prevent data leakage

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split

target_col = 'BAD'

# Features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training features shape:", X_train.shape)
print("Test features shape:", X_test.shape)
print("Training target distribution:\n", y_train.value_counts(normalize=True))
Training features shape: (4768, 12)
Test features shape: (1192, 12)
Training target distribution:
 BAD
0    0.800545
1    0.199455
Name: proportion, dtype: float64

Apply DBSCAN based outlier detection and removal technique

In [53]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Copy training data
X_train_dbscan = X_train.copy()
y_train_dbscan = y_train.copy()

# Select numerical columns
num_cols = X_train_dbscan.select_dtypes(include=['int64', 'float64']).columns

# Standardize numerical features 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_dbscan[num_cols])

# Apply DBSCAN
dbscan = DBSCAN(eps=2, min_samples=5)  
labels = dbscan.fit_predict(X_scaled)

# -1 label means outlier
outliers = labels == -1
print("Number of detected outliers:", outliers.sum())

# Keep only those values which don't have -1 as labels
X_train_clean = X_train_dbscan[labels != -1]
y_train_clean = y_train_dbscan[labels != -1]

print("Shape of cleaned training features:", X_train_clean.shape)
print("Shape of cleaned training target:", y_train_clean.shape)
Number of detected outliers: 192
Shape of cleaned training features: (4576, 12)
Shape of cleaned training target: (4576,)
In [54]:
X_train_clean.head()
Out[54]:
LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
609 7700 70451.00 81862.0 DebtCon Sales 3.0 0.00 0.00 141.27 0.00 31.0 31.68
4015 21000 48735.00 71694.0 DebtCon Other 8.0 0.00 0.00 48.50 3.00 10.0 32.92
1591 11500 63136.00 81099.0 DebtCon Other 3.0 0.46 0.69 149.06 1.42 35.0 28.98
1127 9900 55342.00 72357.0 DebtCon Mgr 7.0 0.66 3.00 112.00 1.00 11.0 39.87
920 9000 47350.86 105000.0 DebtCon ProfExe 6.0 0.00 1.00 227.27 0.00 10.0 30.62
In [55]:
X_train_clean.isnull().sum()
Out[55]:
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64

Taking boxplot of numerical values after removing missing values to check how much data have been lost

In [56]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math

# Numerical columns in cleaned training data
num_cols = X_train_clean.select_dtypes(include=['int64', 'float64']).columns

# Folder to save boxplots
save_dir = "boxplots_dbscan"
os.makedirs(save_dir, exist_ok=True)

# Number of plots per row
n_cols = 3
n_rows = math.ceil(len(num_cols) / n_cols)

# Set figure size
plt.figure(figsize=(5*n_cols, 5*n_rows))

# Loop through numerical columns to plot
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.boxplot(y=X_train_clean[col], color='lightgreen')
    plt.title(col)
    plt.tight_layout()
    
    # Save individual boxplot
    plt.savefig(os.path.join(save_dir, f"{col}_boxplot.png"))

# Show all boxplots together
plt.show()
No description has been provided for this image

Checking the number of unique values in categorical variables

In [57]:
cat_cols = df.select_dtypes(include=['object', 'category']).columns

for col in cat_cols:
    print(f"Column: {col}")
    print(df[col].value_counts())
    print("-" * 50)
Column: REASON
REASON
DebtCon    4180
HomeImp    1780
Name: count, dtype: int64
--------------------------------------------------
Column: JOB
JOB
Other      2667
ProfExe    1276
Office      948
Mgr         767
Self        193
Sales       109
Name: count, dtype: int64
--------------------------------------------------
In [58]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical and categorical columns in cleaned training set
num_cols = X_train_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train_clean.select_dtypes(include=['object', 'category']).columns.tolist()

# Pipeline for numerical features: only standardize
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Pipeline for categorical features: one-hot encode
cat_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Fit pipeline on training data
X_train_processed = preprocessor.fit_transform(X_train_clean)

# Transform test data using the same pipeline
X_test_processed = preprocessor.transform(X_test)

print("Training data processed shape:", X_train_processed.shape)
print("Test data processed shape:", X_test_processed.shape)
Training data processed shape: (4576, 18)
Test data processed shape: (1192, 18)
In [59]:
import pandas as pd

# For numerical + one-hot encoded categorical columns, we can use dummy names
num_cols = X_train_clean.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train_clean.select_dtypes(include=['object', 'category']).columns.tolist()

# OneHotEncoder expands categorical columns
cat_cols_expanded = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(cat_cols)

all_cols = np.concatenate([num_cols, cat_cols_expanded])

# Convert processed array to DataFrame
X_train_df = pd.DataFrame(X_train_processed, columns=all_cols)

# check for missing values per column
X_train_df.isnull().sum()
Out[59]:
LOAN              0
MORTDUE           0
VALUE             0
YOJ               0
DEROG             0
DELINQ            0
CLAGE             0
NINQ              0
CLNO              0
DEBTINC           0
REASON_DebtCon    0
REASON_HomeImp    0
JOB_Mgr           0
JOB_Office        0
JOB_Other         0
JOB_ProfExe       0
JOB_Sales         0
JOB_Self          0
dtype: int64
In [60]:
X_train_df.head()
Out[60]:
LOAN MORTDUE VALUE YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC REASON_DebtCon REASON_HomeImp JOB_Mgr JOB_Office JOB_Other JOB_ProfExe JOB_Sales JOB_Self
0 -0.967113 0.016154 -0.345962 -0.824932 -0.369110 -0.454279 -0.456708 -0.737183 1.024241 -0.305673 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 0.246253 -0.492382 -0.540519 -0.134014 -0.369110 -0.454279 -1.590043 1.244222 -1.126358 -0.114627 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
2 -0.620437 -0.155145 -0.360561 -0.824932 0.477641 0.408898 -0.361541 0.200682 1.433879 -0.721661 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3 -0.766406 -0.337662 -0.527833 -0.272197 0.845793 3.298663 -0.814288 -0.076714 -1.023948 0.956157 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
4 -0.848513 -0.524795 0.096766 -0.410381 -0.369110 0.796702 0.593920 -0.737183 -1.126358 -0.468987 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
In [ ]:
 
In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

try:
    from xgboost import XGBClassifier
    xgb_available = True
except:
    xgb_available = False

# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', probability=True, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB()
}

if xgb_available:
    models["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Store training results
train_results = []

for model_name, model in models.items():
    # Train model
    model.fit(X_train_processed, y_train_clean)
    
    # Predictions
    y_train_pred = model.predict(X_train_processed)
    
    # Metrics
    train_acc = (y_train_pred == y_train_clean).mean()
    class_report = classification_report(y_train_clean, y_train_pred)
    cm = confusion_matrix(y_train_clean, y_train_pred)
    
    # Store results
    train_results.append({
        "Model": model_name,
        "Train Accuracy": train_acc,
        "Model Object": model
    })
    
    # Print metrics
    print(f"--- {model_name} Training ---")
    print("Training Accuracy:", train_acc)
    print("Classification Report:\n", class_report)
    print("Confusion Matrix:\n", cm)
    
    # Plot confusion matrix
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} - Training Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    print("\n")

# Create summary DataFrame
train_results_df = pd.DataFrame(train_results).drop(columns=['Model Object'])
print("Summary of Training Accuracy:")
print(train_results_df)
--- Logistic Regression Training ---
Training Accuracy: 0.8435314685314685
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.91      3814
           1       0.60      0.18      0.28       762

    accuracy                           0.84      4576
   macro avg       0.73      0.58      0.60      4576
weighted avg       0.81      0.84      0.81      4576

Confusion Matrix:
 [[3721   93]
 [ 623  139]]
No description has been provided for this image

--- Random Forest Training ---
Training Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3814
           1       1.00      1.00      1.00       762

    accuracy                           1.00      4576
   macro avg       1.00      1.00      1.00      4576
weighted avg       1.00      1.00      1.00      4576

Confusion Matrix:
 [[3814    0]
 [   0  762]]
No description has been provided for this image

--- Gradient Boosting Training ---
Training Accuracy: 0.930506993006993
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      3814
           1       0.94      0.62      0.75       762

    accuracy                           0.93      4576
   macro avg       0.93      0.81      0.85      4576
weighted avg       0.93      0.93      0.92      4576

Confusion Matrix:
 [[3784   30]
 [ 288  474]]
No description has been provided for this image

--- SVM (RBF Kernel) Training ---
Training Accuracy: 0.8940122377622378
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      3814
           1       0.96      0.38      0.54       762

    accuracy                           0.89      4576
   macro avg       0.93      0.69      0.74      4576
weighted avg       0.90      0.89      0.87      4576

Confusion Matrix:
 [[3803   11]
 [ 474  288]]
No description has been provided for this image

--- KNN Training ---
Training Accuracy: 0.927666083916084
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96      3814
           1       0.97      0.58      0.73       762

    accuracy                           0.93      4576
   macro avg       0.95      0.79      0.84      4576
weighted avg       0.93      0.93      0.92      4576

Confusion Matrix:
 [[3801   13]
 [ 318  444]]
No description has been provided for this image

--- Naive Bayes Training ---
Training Accuracy: 0.8122814685314685
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.91      0.89      3814
           1       0.42      0.35      0.38       762

    accuracy                           0.81      4576
   macro avg       0.65      0.63      0.64      4576
weighted avg       0.80      0.81      0.80      4576

Confusion Matrix:
 [[3453  361]
 [ 498  264]]
No description has been provided for this image

--- XGBoost Training ---
Training Accuracy: 0.9995629370629371
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3814
           1       1.00      1.00      1.00       762

    accuracy                           1.00      4576
   macro avg       1.00      1.00      1.00      4576
weighted avg       1.00      1.00      1.00      4576

Confusion Matrix:
 [[3814    0]
 [   2  760]]
No description has been provided for this image

Summary of Training Accuracy:
                 Model  Train Accuracy
0  Logistic Regression        0.843531
1        Random Forest        1.000000
2    Gradient Boosting        0.930507
3     SVM (RBF Kernel)        0.894012
4                  KNN        0.927666
5          Naive Bayes        0.812281
6              XGBoost        0.999563
In [62]:
train_results_df
Out[62]:
Model Train Accuracy
0 Logistic Regression 0.843531
1 Random Forest 1.000000
2 Gradient Boosting 0.930507
3 SVM (RBF Kernel) 0.894012
4 KNN 0.927666
5 Naive Bayes 0.812281
6 XGBoost 0.999563
In [ ]:
 
In [63]:
print(type(X_test_processed))
<class 'numpy.ndarray'>
In [64]:
print("X_test shape:", X_test.shape)
print("X_test_processed shape:", X_test_processed.shape)
X_test shape: (1192, 12)
X_test_processed shape: (1192, 18)
In [65]:
import pandas as pd

# Get expanded categorical column names
cat_cols_expanded = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(cat_cols)

# Combine with numeric column names
all_cols = np.concatenate([num_cols, cat_cols_expanded])

# Convert to DataFrame
X_test_df = pd.DataFrame(X_test_processed, columns=all_cols)

X_test_df.head()
Out[65]:
LOAN MORTDUE VALUE YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC REASON_DebtCon REASON_HomeImp JOB_Mgr JOB_Office JOB_Other JOB_ProfExe JOB_Sales JOB_Self
0 0.620298 -0.666164 -0.546967 1.386006 -0.36911 -0.454279 0.060788 -0.076714 0.102556 0.000926 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
1 0.118530 -0.854207 -0.729374 0.695088 -0.36911 -0.454279 -0.723030 -0.076714 -0.409492 0.634152 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
2 -0.666052 -0.354078 -0.655114 -0.548565 -0.36911 -0.454279 1.765737 -0.737183 -1.126358 -0.056080 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
3 -0.793775 -0.574226 -0.761672 -0.963115 -0.36911 0.796702 -1.167714 -0.737183 -0.204673 0.737378 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
4 -1.304666 -0.275418 -0.583647 -0.755840 -0.36911 0.796702 -0.121973 -0.737183 -0.307082 -0.094598 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0
In [66]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Store testing results
test_results = []

for result in train_results:
    model_name = result["Model"]
    model = result["Model Object"]
    
    # Predict on test data
    y_test_pred = model.predict(X_test_processed)
    
    # Metrics
    test_acc = accuracy_score(y_test, y_test_pred)
    class_report = classification_report(y_test, y_test_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_test_pred)
    
    # Store results (weighted avg metrics)
    test_results.append({
        "Model": model_name,
        "Test Accuracy": test_acc,
        "Precision (Weighted)": class_report['weighted avg']['precision'],
        "Recall (Weighted)": class_report['weighted avg']['recall'],
        "F1-Score (Weighted)": class_report['weighted avg']['f1-score']
    })
    
    # Print metrics
    print(f"--- {model_name} Testing ---")
    print("Test Accuracy:", test_acc)
    print("Classification Report:\n", classification_report(y_test, y_test_pred))
    print("Confusion Matrix:\n", cm)
    
    # Plot confusion matrix
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
    plt.title(f'{model_name} - Test Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    print("\n")

# Summary DataFrame
test_results_df = pd.DataFrame(test_results)

# Sort by accuracy (best model on top)
test_results_df = test_results_df.sort_values(by="Test Accuracy", ascending=False)

print("Summary of Test Performance:")
print(test_results_df)
--- Logistic Regression Testing ---
Test Accuracy: 0.8355704697986577
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90       954
           1       0.73      0.28      0.40       238

    accuracy                           0.84      1192
   macro avg       0.79      0.63      0.65      1192
weighted avg       0.82      0.84      0.80      1192

Confusion Matrix:
 [[930  24]
 [172  66]]
No description has been provided for this image

--- Random Forest Testing ---
Test Accuracy: 0.9068791946308725
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.94       954
           1       0.98      0.55      0.70       238

    accuracy                           0.91      1192
   macro avg       0.94      0.77      0.82      1192
weighted avg       0.91      0.91      0.90      1192

Confusion Matrix:
 [[951   3]
 [108 130]]
No description has been provided for this image

--- Gradient Boosting Testing ---
Test Accuracy: 0.8808724832214765
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93       954
           1       0.86      0.48      0.62       238

    accuracy                           0.88      1192
   macro avg       0.87      0.73      0.77      1192
weighted avg       0.88      0.88      0.87      1192

Confusion Matrix:
 [[935  19]
 [123 115]]
No description has been provided for this image

--- SVM (RBF Kernel) Testing ---
Test Accuracy: 0.8598993288590604
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92       954
           1       0.97      0.31      0.47       238

    accuracy                           0.86      1192
   macro avg       0.91      0.65      0.69      1192
weighted avg       0.88      0.86      0.83      1192

Confusion Matrix:
 [[952   2]
 [165  73]]
No description has been provided for this image

--- KNN Testing ---
Test Accuracy: 0.8800335570469798
Classification Report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       954
           1       0.97      0.41      0.58       238

    accuracy                           0.88      1192
   macro avg       0.92      0.70      0.75      1192
weighted avg       0.89      0.88      0.86      1192

Confusion Matrix:
 [[951   3]
 [140  98]]
No description has been provided for this image

--- Naive Bayes Testing ---
Test Accuracy: 0.8062080536912751
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88       954
           1       0.52      0.40      0.45       238

    accuracy                           0.81      1192
   macro avg       0.69      0.65      0.67      1192
weighted avg       0.79      0.81      0.80      1192

Confusion Matrix:
 [[866  88]
 [143  95]]
No description has been provided for this image

--- XGBoost Testing ---
Test Accuracy: 0.8968120805369127
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94       954
           1       0.89      0.55      0.68       238

    accuracy                           0.90      1192
   macro avg       0.89      0.77      0.81      1192
weighted avg       0.90      0.90      0.89      1192

Confusion Matrix:
 [[937  17]
 [106 132]]
No description has been provided for this image

Summary of Test Performance:
                 Model  Test Accuracy  Precision (Weighted)  \
1        Random Forest       0.906879              0.913876   
6              XGBoost       0.896812              0.895881   
2    Gradient Boosting       0.880872              0.878645   
4                  KNN       0.880034              0.891368   
3     SVM (RBF Kernel)       0.859899              0.876452   
0  Logistic Regression       0.835570              0.821840   
5          Naive Bayes       0.806208              0.790559   

   Recall (Weighted)  F1-Score (Weighted)  
1           0.906879             0.896130  
6           0.896812             0.887246  
2           0.880872             0.867299  
4           0.880034             0.859811  
3           0.859899             0.828933  
0           0.835570             0.804392  
5           0.806208             0.796264  
In [67]:
test_results_df
Out[67]:
Model Test Accuracy Precision (Weighted) Recall (Weighted) F1-Score (Weighted)
1 Random Forest 0.906879 0.913876 0.906879 0.896130
6 XGBoost 0.896812 0.895881 0.896812 0.887246
2 Gradient Boosting 0.880872 0.878645 0.880872 0.867299
4 KNN 0.880034 0.891368 0.880034 0.859811
3 SVM (RBF Kernel) 0.859899 0.876452 0.859899 0.828933
0 Logistic Regression 0.835570 0.821840 0.835570 0.804392
5 Naive Bayes 0.806208 0.790559 0.806208 0.796264

Hyperparameter optimized code

In [68]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Parameter grid
rf_param_grid = {
    'n_estimators': [200, 300, 500, 700],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Generate random parameter combinations
n_iter = 30
param_list = list(ParameterSampler(rf_param_grid, n_iter=n_iter, random_state=42))

best_score = -np.inf
best_params = None

print("Starting Random Forest hyperparameter tuning...\n")

# Progress bar loop
for params in tqdm(param_list, desc="Tuning Progress"):
    
    rf_model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)
    
    scores = cross_val_score(
        rf_model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )
    
    mean_score = scores.mean()
    
    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print("\nBest Parameters:", best_params)
print("Best CV Score:", best_score)

# Train best model on full training data
rf_best = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
rf_best.fit(X_train_processed, y_train_clean)

# Evaluate on test set
y_pred_rf = rf_best.predict(X_test_processed)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_rf)

print("\nConfusion Matrix:\n", cm)

plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=np.unique(y_test),
    yticklabels=np.unique(y_test)
)
plt.title("Random Forest - Test Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting Random Forest hyperparameter tuning...

Tuning Progress: 100%|██████████| 30/30 [10:36<00:00, 21.21s/it]
Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}
Best CV Score: 0.91991748068158

Test Accuracy: 0.9119127516778524
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.99      0.95       954
           1       0.95      0.59      0.73       238

    accuracy                           0.91      1192
   macro avg       0.93      0.79      0.84      1192
weighted avg       0.92      0.91      0.90      1192


Confusion Matrix:
 [[947   7]
 [ 98 140]]
No description has been provided for this image
In [69]:
feature_names = preprocessor.get_feature_names_out()

Checking the most important features that contributed to the final model of RandomForest

In [70]:
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_best.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance.head(10))
        Feature  Importance
9  num__DEBTINC    0.146323
6    num__CLAGE    0.118481
0     num__LOAN    0.105323
8     num__CLNO    0.096939
2    num__VALUE    0.095921
1  num__MORTDUE    0.092169
5   num__DELINQ    0.086227
3      num__YOJ    0.071277
4    num__DEROG    0.055741
7     num__NINQ    0.052061
In [ ]:
 
In [71]:
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Parameter grid
xgb_param_grid = {
    'n_estimators': [200, 300, 500],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 3, 5]
}

# Generate random combinations
n_iter = 30
param_list = list(ParameterSampler(xgb_param_grid, n_iter=n_iter, random_state=42))

best_score = -np.inf
best_params = None

print("Starting XGBoost hyperparameter tuning...\n")

for params in tqdm(param_list, desc="Tuning Progress"):

    model = XGBClassifier(
        **params,
        eval_metric='logloss',
        random_state=42,
        use_label_encoder=False,
        n_jobs=-1
    )

    scores = cross_val_score(
        model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )

    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print("\nBest XGB Parameters:", best_params)
print("Best XGB CV Score:", best_score)

# Train best model on full training data
xgb_best = XGBClassifier(
    **best_params,
    eval_metric='logloss',
    random_state=42,
    use_label_encoder=False,
    n_jobs=-1
)

xgb_best.fit(X_train_processed, y_train_clean)

# Evaluate on test set
y_pred_xgb = xgb_best.predict(X_test_processed)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_xgb)

plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
plt.title("XGBoost - Test Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting XGBoost hyperparameter tuning...

Tuning Progress: 100%|██████████| 30/30 [00:17<00:00,  1.72it/s]
Best XGB Parameters: {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 0.8}
Best XGB CV Score: 0.9172260413327648

Test Accuracy: 0.9077181208053692
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.95       954
           1       0.94      0.57      0.71       238

    accuracy                           0.91      1192
   macro avg       0.92      0.78      0.83      1192
weighted avg       0.91      0.91      0.90      1192

No description has been provided for this image
In [72]:
from sklearn.ensemble import GradientBoostingClassifier

gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_list = list(ParameterSampler(gb_param_grid, n_iter=20, random_state=42))

best_score = -np.inf
best_params = None

print("Starting Gradient Boosting tuning...\n")

for params in tqdm(param_list, desc="GB Tuning"):

    model = GradientBoostingClassifier(**params, random_state=42)

    scores = cross_val_score(
        model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )

    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print("\nBest GB Parameters:", best_params)
print("Best CV Score:", best_score)

gb_best = GradientBoostingClassifier(**best_params, random_state=42)
gb_best.fit(X_train_processed, y_train_clean)

y_pred = gb_best.predict(X_test_processed)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges')
plt.title("Gradient Boosting - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting Gradient Boosting tuning...

GB Tuning: 100%|██████████| 20/20 [01:22<00:00,  4.10s/it]
Best GB Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 7, 'learning_rate': 0.1}
Best CV Score: 0.9157983124946376

Test Accuracy: 0.9077181208053692
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.99      0.94       954
           1       0.93      0.58      0.72       238

    accuracy                           0.91      1192
   macro avg       0.92      0.79      0.83      1192
weighted avg       0.91      0.91      0.90      1192

No description has been provided for this image
In [73]:
from sklearn.neighbors import KNeighborsClassifier

knn_param_grid = {
    'n_neighbors': [3, 5, 7, 9, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

param_list = list(ParameterSampler(knn_param_grid, n_iter=15, random_state=42))

best_score = -np.inf
best_params = None

print("Starting KNN tuning...\n")

for params in tqdm(param_list, desc="KNN Tuning"):

    model = KNeighborsClassifier(**params)

    scores = cross_val_score(
        model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )

    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print("\nBest KNN Parameters:", best_params)
print("Best CV Score:", best_score)

knn_best = KNeighborsClassifier(**best_params)
knn_best.fit(X_train_processed, y_train_clean)

y_pred = knn_best.predict(X_test_processed)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples')
plt.title("KNN - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting KNN tuning...

KNN Tuning: 100%|██████████| 15/15 [00:02<00:00,  7.09it/s]
Best KNN Parameters: {'weights': 'distance', 'n_neighbors': 3, 'metric': 'manhattan'}
Best CV Score: 0.9115521742087347

Test Accuracy: 0.910234899328859
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       954
           1       0.99      0.55      0.71       238

    accuracy                           0.91      1192
   macro avg       0.95      0.78      0.83      1192
weighted avg       0.92      0.91      0.90      1192

No description has been provided for this image
In [74]:
from sklearn.linear_model import LogisticRegression

log_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

param_list = list(ParameterSampler(log_param_grid, n_iter=10, random_state=42))

best_score = -np.inf
best_params = None

print("Starting Logistic Regression tuning...\n")

for params in tqdm(param_list, desc="LogReg Tuning"):

    model = LogisticRegression(**params, max_iter=1000, random_state=42)

    scores = cross_val_score(
        model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )

    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print("\nBest Logistic Parameters:", best_params)
print("Best CV Score:", best_score)

log_best = LogisticRegression(**best_params, max_iter=1000, random_state=42)
log_best.fit(X_train_processed, y_train_clean)

y_pred = log_best.predict(X_test_processed)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds')
plt.title("Logistic Regression - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting Logistic Regression tuning...

LogReg Tuning: 100%|██████████| 10/10 [00:00<00:00, 13.75it/s]
Best Logistic Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 10}
Best CV Score: 0.8046959306886496

Test Accuracy: 0.8338926174496645
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90       954
           1       0.72      0.28      0.40       238

    accuracy                           0.83      1192
   macro avg       0.78      0.63      0.65      1192
weighted avg       0.82      0.83      0.80      1192

No description has been provided for this image
In [75]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# MLP Hyperparameter Grid
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01],   
    'learning_rate_init': [0.001, 0.01],
    'solver': ['adam']
}

param_list = list(ParameterSampler(mlp_param_grid, n_iter=10, random_state=42))

best_score = -np.inf
best_params = None

print("Starting MLP tuning...\n")

for params in tqdm(param_list, desc="MLP Tuning"):

    model = MLPClassifier(**params, max_iter=500, random_state=42)

    scores = cross_val_score(
        model,
        X_train_processed,
        y_train_clean,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1
    )

    mean_score = scores.mean()

    if mean_score > best_score:
        best_score = mean_score
        best_params = params

print("\nBest MLP Parameters:", best_params)
print("Best CV Score:", best_score)

# Train best model
mlp_best = MLPClassifier(**best_params, max_iter=500, random_state=42)
mlp_best.fit(X_train_processed, y_train_clean)

# Test prediction
y_pred = mlp_best.predict(X_test_processed)

print("\nTest Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds')
plt.title("MLP - Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
Starting MLP tuning...

MLP Tuning: 100%|██████████| 10/10 [01:39<00:00,  9.92s/it]
Best MLP Parameters: {'solver': 'adam', 'learning_rate_init': 0.001, 'hidden_layer_sizes': (100,), 'alpha': 0.0001, 'activation': 'tanh'}
Best CV Score: 0.915321407724672

Test Accuracy: 0.9026845637583892
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       954
           1       0.87      0.61      0.71       238

    accuracy                           0.90      1192
   macro avg       0.89      0.79      0.83      1192
weighted avg       0.90      0.90      0.90      1192

No description has been provided for this image
In [ ]: